Understand which values within the metadata are correlated. Make sure you understand how these things are related in reality and mathematically.

Settup

Libraries

tellme <- function(name){message("Package ", name, " version: ", packageVersion(name))}

library(tidyr); tellme("tidyr")
## Package tidyr version: 1.3.0
suppressPackageStartupMessages(library(dplyr)); tellme("dplyr")
## Package dplyr version: 1.1.2
library(ggplot2); tellme("ggplot2")
## Package ggplot2 version: 3.4.2

Direct output.

Output will be saved to: ../output

Read metadata

Get the per-particpant metadata.

pipeline = "../../"
prevModule = dir(path=pipeline, pattern="Participant_Metadata", full.names = TRUE)
fileName = file.path(prevModule, "output", "ANIGMA-metadata_by_AN_participant.txt")
diffs = read.delim(fileName)

Read in data from file: ../..//01_Participant_Metadata/output/ANIGMA-metadata_by_AN_participant.txt

This table has 118 rows and 25 columns.

Read per-sample data.

meta = read.delim("../../input/meta/ANIGMA-metadata.txt") %>%
  select(PARTICIPANT.ID, LOCATION, TIMEPOINT, AGE, SUBTYPE, BMI, 
         STAI_Y1, STAI_Y2, STAI_TOTAL, PSS, DAYS_TREAT, Weight_kg, 
         DUR_ILLNESS_YRS)
dim(meta)
## [1] 352  13

This meta data has 352 rows and 13 columns.

The HC equivalent data frame:

HC = meta %>% filter(TIMEPOINT=="HC") %>%
  select(PARTICIPANT.ID, LOCATION, AGE, SUBTYPE, STAI_Y1, STAI_Y2, STAI_TOTAL, PSS, BMI, Weight_kg)

This subset of the data has 136 rows and 10 columns.

Visualize broad view correlations

Create a correlation matrix.

df = diffs %>% select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)
cor.co = cor(df, use="pairwise.complete.obs", method = "pearson")
# much of this is taken from
# http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization

#functions
tidyMelt <- function(cormat, values_name="values"){
    cormat.new = cormat %>% data.frame() 
    cormat.new$var1 = row.names(cormat.new)
    melted_cormat = cormat.new %>% pivot_longer(cols=-var1, names_to = "var2", values_to = values_name)
    return(melted_cormat)
}

make_heatmap_1 <- function(cor.co2){
    ggplot(data = cor.co2, aes(x=var1, y=var2, fill=cor)) + 
        geom_tile(color = "white")+
        scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                             midpoint = 0, limit = c(-1,1), space = "Lab", 
                             name="Pearson\nCorrelation",
                             na.value = gray(.9)) +
        theme_minimal()+ # minimal theme
        xlab("") +
        ylab("") +
        theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                         size = 8, hjust = 1)) +
        coord_fixed()
}

cor.co2 = tidyMelt(cor.co, values_name="cor")

make_heatmap_1(cor.co2)

Use only the upper triangle, and order features in a sensible way.

# actions

# Reorder the correlation matrix
cormat <- reorder_cormat(cor.co)
# upper_tri <- get_upper_tri(cormat)
triangle1 = get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat = tidyMelt(triangle1, values_name="cor")
# order levels to match triangle
melted_cormat$var1 = factor(melted_cormat$var1, levels=row.names(triangle1))
melted_cormat$var2 = factor(melted_cormat$var2, levels=row.names(triangle1))

# Create a ggheatmap
make_heatmap_1(melted_cormat)

Manually set the order.

manual.order = c('BMI.T1', 'Weight_kg.T1', 'BMI.T2', 'Weight_kg.T2',
                 'T1.severity', 'DAYS_TREAT', 'BMI.diff', 'Weight_kg.diff', 
                 'AGE', 'DUR_ILLNESS_YRS', 
                 'PSS.diff', 'STAI_Y1.diff', 'STAI_Y2.diff', 'STAI_TOTAL.diff', 
                 'PSS.T2', 
                 'STAI_Y2.T1', 'PSS.T1', 'STAI_Y1.T1', 'STAI_TOTAL.T1', 'STAI_Y1.T2', 'STAI_Y2.T2', 'STAI_TOTAL.T2')
manual.order
##  [1] "BMI.T1"          "Weight_kg.T1"    "BMI.T2"          "Weight_kg.T2"   
##  [5] "T1.severity"     "DAYS_TREAT"      "BMI.diff"        "Weight_kg.diff" 
##  [9] "AGE"             "DUR_ILLNESS_YRS" "PSS.diff"        "STAI_Y1.diff"   
## [13] "STAI_Y2.diff"    "STAI_TOTAL.diff" "PSS.T2"          "STAI_Y2.T1"     
## [17] "PSS.T1"          "STAI_Y1.T1"      "STAI_TOTAL.T1"   "STAI_Y1.T2"     
## [21] "STAI_Y2.T2"      "STAI_TOTAL.T2"

Replot with manual order.

# Reorder the correlation matrix
cormat <- cor.co[manual.order, manual.order]

# upper_tri <- get_upper_tri(cormat)
triangle1 = get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat = tidyMelt(triangle1, values_name="cor")
# order levels to match manual order
melted_cormat$var1 = factor(melted_cormat$var1, levels=row.names(triangle1))
melted_cormat$var2 = factor(melted_cormat$var2, levels=row.names(triangle1))

# Create a ggheatmap
make_heatmap_1(melted_cormat)

Now lets get some p-values.

feats = colnames(df)

pvalCut = 0.01

cors = matrix(data=NA, nrow=length(feats), ncol=length(feats), dimnames = list(feats, feats))
pvals = cors
cors.sig = cors

for (i in feats){
    for (j in feats){
        if ( i != j ){
            res = cor.test(df[,i], df[,j], method="pearson")
            cors[i,j] = res$estimate
            pvals[i,j] = res$p.value
            if (res$p.value < pvalCut){
                cors.sig[i,j] = res$estimate
            }
        }
    }
}

Make a new plot that only includes values were the correlation was significant (p < 0.01).

reshapeMatrixToPlot <- function(correlationMatrix){
    # Reorder the correlation matrix
    cormat <- correlationMatrix[manual.order, manual.order]
    
    # only one triangle
    triangle1 = get_upper_tri(cormat)
    
    # Melt the correlation matrix
    melted_cormat = tidyMelt(triangle1, values_name="cor")
    
    # order levels to match triangle order
    melted_cormat$var1 = factor(melted_cormat$var1, levels=row.names(triangle1))
    melted_cormat$var2 = factor(melted_cormat$var2, levels=row.names(triangle1))
    
    return(melted_cormat)
}

# match most recent plot - show melt and plot method matches
make_heatmap_1(reshapeMatrixToPlot(cor.co)) + ggtitle("Upper triangle")

# should also match most recent plot - show input matrix matches
make_heatmap_1(reshapeMatrixToPlot(cors)) + ggtitle("Upper triangle, excluding identity line")

# only sig
make_heatmap_1(reshapeMatrixToPlot(cors.sig)) + ggtitle(paste0("Correlations with p-value less than ", pvalCut))

Now lets note what is mathmatically related.

# T1 severity is related to BMI at T1 and therefore also related to weight at T1
severity = data.frame(x=c("T1.severity", "T1.severity"), 
                      y=c("BMI.T1", "Weight_kg.T1"))

related = data.frame(x=c("STAI_TOTAL", "STAI_TOTAL", "BMI"),
                     y=c("STAI_Y1", "STAI_Y2", "Weight_kg"))
# these are related within a given time point
related1 = data.frame(x=paste(related$x, "T1", sep="."),
                      y=paste(related$y, "T1", sep="."))
related2 = data.frame(x=paste(related$x, "T2", sep="."),
                      y=paste(related$y, "T2", sep="."))

# by extension, the diff of each thing is related to each time point of the other thing
related1.diff = data.frame(x=paste(related$x, "T1", sep="."),
                      y=paste(related$y, "diff", sep="."))
related1.diff.rev = data.frame(x=paste(related$x, "diff", sep="."),
                      y=paste(related$y, "T1", sep="."))
related2.diff = data.frame(x=paste(related$x, "T2", sep="."),
                      y=paste(related$y, "diff", sep="."))
related2.diff.rev = data.frame(x=paste(related$x, "diff", sep="."),
                      y=paste(related$y, "T2", sep="."))

# all diffables have a relationship between their diff and each time point
# diffables = c("STAI_Y1", "STAI_Y2", "STAI_TOTAL", "PSS", "BMI", "Weight_kg")
diffables = gsub(grep(".diff", names(diffs), value = T), pattern=".diff", replacement="")
relatedDiff1 = data.frame(x=paste(diffables, "T1", sep="."),
                         y=paste(diffables, "diff", sep="."))
relatedDiff2 = data.frame(x=paste(diffables, "T2", sep="."),
                         y=paste(diffables, "diff", sep="."))

stack = rbind(related1, related2, 
              related1.diff, related1.diff.rev, 
              related2.diff, related2.diff.rev, 
              relatedDiff1, relatedDiff2, 
              severity)

# relatedness goes both ways
reverse.stack = data.frame(x=stack$y, y=stack$x)
stack2 = rbind(stack, reverse.stack)

head(stack2)
##               x            y
## 1 STAI_TOTAL.T1   STAI_Y1.T1
## 2 STAI_TOTAL.T1   STAI_Y2.T1
## 3        BMI.T1 Weight_kg.T1
## 4 STAI_TOTAL.T2   STAI_Y1.T2
## 5 STAI_TOTAL.T2   STAI_Y2.T2
## 6        BMI.T2 Weight_kg.T2

test annotations

make_heatmap_1(reshapeMatrixToPlot(cor.co)) + 
    # mark related features
    annotate(geom="text", x=stack2$x, y=stack2$y, label="+") +
    # identity line gets similar marking, slighly bigger
    annotate(geom="text", x=feats, y=feats, label="+", size=6)

Finalize figure

Show all correlations on the upper triangle but in the lower triangle show only the significant ones.

# upper - all correlation values
# Reorder the correlation matrix
cormat <- cor.co[manual.order, manual.order]
# only one triangle
triangle1 = get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat1 = tidyMelt(triangle1, values_name="cor") %>% filter(!is.na(cor))
# order levels to match triangle order
melted_cormat1$var1 = factor(melted_cormat1$var1, levels=manual.order)
melted_cormat1$var2 = factor(melted_cormat1$var2, levels=manual.order)

# lower - only sig values
cormat <- cors.sig[manual.order, manual.order]
# only one triangle
get_lower_tri2<-function(cormat){
    cormat[upper.tri(cormat)] <- 50
    for (f in feats){
        cormat[f, f] <- 50
    }
    return(cormat)
}
triangle2 = get_lower_tri2(cormat)

# Melt the correlation matrix
melted_cormat2 = tidyMelt(triangle2, values_name="cor") %>% filter(cor != 50)
# order levels to match triangle order
melted_cormat2$var1 = factor(melted_cormat2$var1, levels=manual.order)
melted_cormat2$var2 = factor(melted_cormat2$var2, levels=manual.order)

# merge in melted form
melted_cormat = rbind(melted_cormat1, melted_cormat2)
# order levels to match triangle order
melted_cormat$var1 = factor(melted_cormat$var1, levels=manual.order)
melted_cormat$var2 = factor(melted_cormat$var2, levels=manual.order)

figure = make_heatmap_1(melted_cormat) + 
    # mark related features
    annotate(geom="text", x=stack2$x, y=stack2$y, label="+") +
    # identity line gets similar marking, slighly bigger
    annotate(geom="text", x=feats, y=feats, label="+", size=6)
print(figure)

Figure Variants

The objects we need moving forward are:

Remove everything else.

rm(list = setdiff(ls(), c("manual.order", "stack2", "diffs", "HC", "outdir")))

As a function:

#functions
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)
}

tidyMelt <- function(cormat, values_name="values"){
    cormat.new = cormat %>% data.frame() 
    cormat.new$var1 = row.names(cormat.new)
    melted_cormat = cormat.new %>% pivot_longer(cols=-var1, names_to = "var2", values_to = values_name)
    return(melted_cormat)
}

make_heatmap_1 <- function(cor.co2){
    ggplot(data = cor.co2, aes(x=var1, y=var2, fill=cor)) + 
        geom_tile(color = "white")+
        scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                             midpoint = 0, limit = c(-1,1), space = "Lab", 
                             name="Pearson\nCorrelation",
                             na.value = gray(.9)) +
        theme_minimal()+ # minimal theme
        xlab("") +
        ylab("") +
        theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                         size = 8, hjust = 1)) +
        coord_fixed()
}

make_heatmap_2 <- function(dataframe, my.order=manual.order, markPairings=stack2, pvalCut=0.01,
                           cor.mat.file=NULL, pval.mat.file=NULL){
    if (is.null(my.order)){
        my.order = names(dataframe)
    }
    
    feats = colnames(dataframe)
    
    if (!is.null(markPairings)){
        names(markPairings) = c("x", "y")
        markPairings = markPairings %>% 
            filter( x %in% feats) %>%
            filter( y %in% feats)
    }
    
    cor.co = cor(dataframe, use="pairwise.complete.obs", method = "pearson")
    if (!is.null(cor.mat.file)){
        write.table(cbind(feature=row.names(cor.co), cor.co), 
                    file=cor.mat.file, 
                    sep="\t", quote=F, row.names = F, col.names = T)
    }
    
    cors = matrix(data=NA, nrow=length(feats), 
                  ncol=length(feats), 
                  dimnames = list(feats, feats))
    pvals = cors
    cors.sig = cors
    
    for (i in feats){
        for (j in feats){
            if ( i != j ){
                res = cor.test(dataframe[,i], dataframe[,j], method="pearson")
                pvals[i,j] = res$p.value
                if (res$p.value < pvalCut){
                    cors.sig[i,j] = res$estimate
                }
            }
        }
    }
    
    if (!is.null(pval.mat.file)){
        write.table(cbind(feature=row.names(pvals), pvals), 
                    file=pval.mat.file, 
                    sep="\t", quote=F, row.names = F, col.names = T)
    }
    
    # upper - all correlation values
    # Reorder the correlation matrix
    cormat <- cor.co[my.order, my.order]
    # only one triangle
    triangle1 = get_upper_tri(cormat)
    # Melt the correlation matrix
    melted_cormat1 = tidyMelt(triangle1, values_name="cor") %>% filter(!is.na(cor))
    # order levels to match triangle order
    melted_cormat1$var1 = factor(melted_cormat1$var1, levels=my.order)
    melted_cormat1$var2 = factor(melted_cormat1$var2, levels=my.order)
    
    # lower - only sig values
    cormat <- cors.sig[my.order, my.order]
    # only one triangle
    get_lower_tri2<-function(cormat){
        cormat[upper.tri(cormat)] <- 50
        for (f in feats){
            cormat[f, f] <- 50
        }
        return(cormat)
    }
    triangle2 = get_lower_tri2(cormat)
    
    # Melt the correlation matrix
    melted_cormat2 = tidyMelt(triangle2, values_name="cor") %>% filter(cor != 50)
    # order levels to match triangle order
    melted_cormat2$var1 = factor(melted_cormat2$var1, levels=my.order)
    melted_cormat2$var2 = factor(melted_cormat2$var2, levels=my.order)
    
    # merge in melted form
    melted_cormat = rbind(melted_cormat1, melted_cormat2)
    # order levels to match triangle order
    melted_cormat$var1 = factor(melted_cormat$var1, levels=my.order)
    melted_cormat$var2 = factor(melted_cormat$var2, levels=my.order)
    
    figure = make_heatmap_1(melted_cormat)
    
    if (!is.null(markPairings)){
        figure = figure + 
            # mark related features
            annotate(geom="text", x=markPairings[,1], y=markPairings[,2], label="+") +
            # identity line gets similar marking, slighly bigger
            annotate(geom="text", x=feats, y=feats, label="+", size=6)
    }
    
    return(figure)
}

fig1 = make_heatmap_2(diffs %>% select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE),
                      cor.mat.file = file.path(outdir, "correlations_AN_cor.txt"),
                      pval.mat.file = file.path(outdir, "correlations_AN_pval.txt"))
fig1

Save figure to file.

ggsave(filename = file.path(outdir, "correlations_AN.png"),
       plot = fig1)
## Saving 7 x 5 in image

Add LOCATION and SUBTYPE as a row.

make_heatmap_2(diffs %>% 
                   mutate(LOC.N = as.numeric(factor(LOCATION))) %>%
                   mutate(TYPE.N = as.numeric(factor(SUBTYPE))) %>%
                   select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE),
               my.order = c( "TYPE.N", "LOC.N", manual.order))

AN Subtypes

s1 = make_heatmap_2(diffs %>%
                   filter(SUBTYPE == "BP") %>%
                   select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)) +
    ggtitle("BP")

s2 = make_heatmap_2(diffs %>%
                   filter(SUBTYPE == "ANR") %>%
                   select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)) +
    ggtitle("ANR")

## not enough of these to plot
# 
# make_heatmap_2(diffs %>%
#                    filter(SUBTYPE == "EDNOS") %>%
#                    select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)) +
#     ggtitle("EDNOS")
# 
# make_heatmap_2(diffs %>%
#                    filter(SUBTYPE == "ARFID") %>%
#                    select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)) +
#     ggtitle("ARFID")
fileName.locations = file.path(outdir, "correlations_by-subtype.pdf")
pdf(fileName.locations)

s1
s2

dev.off()
## quartz_off_screen 
##                 2
s1

s2

Location subsets

Using the above function.

p1 = make_heatmap_2(diffs %>%
                   filter(LOCATION == "ACUTE") %>%
                   select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)) +
    ggtitle("ACUTE")


# no ceed values for DUR_ILLNESS_YRS
p2 = make_heatmap_2(diffs %>%
                   filter(LOCATION == "CEED") %>%
                   select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE, -DUR_ILLNESS_YRS),
               my.order = setdiff(manual.order, "DUR_ILLNESS_YRS") ) +
    ggtitle("CEED")


diffs.FARGO = diffs %>%
    filter(LOCATION == "FARGO") %>%
    select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE) %>%
    select(-contains("STAI_Y2"), -contains("STAI_TOTAL") )
p3 = make_heatmap_2(diffs.FARGO,
               my.order=manual.order[manual.order %in% names(diffs.FARGO)],
               markPairings = stack2 %>% 
                   filter(x %in% names(diffs.FARGO)) %>%
                   filter(y %in% names(diffs.FARGO))) +
    ggtitle("FARGO")
fileName.locations = file.path(outdir, "correlations_by-location.pdf")
pdf(fileName.locations)

p1
p2
p3

dev.off()
## quartz_off_screen 
##                 2
p1

p2

p3

HC group

hc1 = make_heatmap_2(HC %>%
                   select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE),
               my.order = NULL, markPairings = stack2) +
    ggtitle("Non-eating Disorder")
hc1

hc2 = make_heatmap_2(HC %>%
                   filter(LOCATION == "ACUTE") %>%
                   select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE),
               my.order = NULL) +
    ggtitle("Non-eating Disorder - ACUTE")


hc3 = make_heatmap_2(HC %>%
                   filter(LOCATION == "CEED") %>%
                   select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE),
               my.order = NULL) +
    ggtitle("Non-eating Disorder - CEED")

hc4 = make_heatmap_2(HC %>%
                   filter(LOCATION == "FARGO") %>%
                   select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE) %>%
                   select(-contains("STAI_Y2"), -contains("STAI_TOTAL")),
               my.order = NULL) +
    ggtitle("Non-eating Disorder - FARGO")
fileName.locations = file.path(outdir, "correlations_non-ed_HC.pdf")
pdf(fileName.locations)

hc1
hc2
hc3
hc4

dev.off()
## quartz_off_screen 
##                 2
hc2

hc3

hc4

sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.4.2 dplyr_1.1.2   tidyr_1.3.0  
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.3      jsonlite_1.8.4    highr_0.10        compiler_4.3.0   
##  [5] tidyselect_1.2.0  jquerylib_0.1.4   textshaping_0.3.6 systemfonts_1.0.4
##  [9] scales_1.2.1      yaml_2.3.7        fastmap_1.1.1     R6_2.5.1         
## [13] labeling_0.4.2    generics_0.1.3    knitr_1.42        tibble_3.2.1     
## [17] munsell_0.5.0     bslib_0.4.2       pillar_1.9.0      rlang_1.1.1      
## [21] utf8_1.2.3        cachem_1.0.8      xfun_0.39         sass_0.4.6       
## [25] cli_3.6.1         withr_2.5.0       magrittr_2.0.3    digest_0.6.31    
## [29] grid_4.3.0        rstudioapi_0.14   lifecycle_1.0.3   vctrs_0.6.2      
## [33] evaluate_0.21     glue_1.6.2        farver_2.1.1      ragg_1.2.5       
## [37] fansi_1.0.4       colorspace_2.1-0  rmarkdown_2.21    purrr_1.0.1      
## [41] tools_4.3.0       pkgconfig_2.0.3   htmltools_0.5.5